package com.shujia.spark.core

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

/**
 * Demonstrates random sampling of an RDD with `RDD.sample`.
 *
 * Arguments passed to `sample`:
 *  - `withReplacement`: whether sampling is done with replacement
 *  - `fraction`: the sampling ratio; because the sampling is random, the
 *    size of the result only approximately matches this ratio
 *  - `seed`: the seed of the random number generator
 */
object Demo13Sample {

  /**
   * Reads the students file, takes a seeded ~1% sample without replacement
   * and prints every sampled line.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf()
      .setMaster("local")
      .setAppName("Demo13Sample")

    val sc: SparkContext = new SparkContext(conf)

    // Always release the context, even when the job fails.
    try {
      val stuRDD: RDD[String] = sc.textFile("spark/data/stu/students.txt")

      // `sample` is a lazy transformation: without an action such as
      // `foreach` nothing is executed. The unseeded variant is kept here only
      // as a reference:
      //   stuRDD.sample(withReplacement = false, fraction = 0.01).foreach(println)

      // To get the same sample on every run, fix the seed to a constant value.
      stuRDD
        .sample(withReplacement = false, fraction = 0.01, seed = 111)
        .foreach(println)
    } finally {
      sc.stop()
    }
  }

}
